// /*
//  * Copyright © 2014-2015 Broadcom
//  *
//  * Permission is hereby granted, free of charge, to any person obtaining a
//  * copy of this software and associated documentation files (the "Software"),
//  * to deal in the Software without restriction, including without limitation
//  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
//  * and/or sell copies of the Software, and to permit persons to whom the
//  * Software is furnished to do so, subject to the following conditions:
//  *
//  * The above copyright notice and this permission notice (including the next
//  * paragraph) shall be included in all copies or substantial portions of the
//  * Software.
//  *
//  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
//  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
//  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
//  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
//  * IN THE SOFTWARE.
//  */

// /** @file vc4_job.c
//  *
//  * Functions for submitting VC4 render jobs to the kernel.
//  */
#include "ralloc.h"
#include <xf86drm.h>
#include "vc4_drm.h"

#include "util/hash_table.h"

#include "broadcom/common/v3d_device_info.h"
#include "broadcom/cle/v3d_decoder.h"
#include "broadcom/clif/clif_dump.h"

#include "vc4_vk_common.h"
#include "vc4_private.h"
#include "vc4_vk_formats.h"
#include "vc4_vk_job.h"

#include "vc4_memory.h"

/**
 * Releases a job and the backing storage of each of its lists.
 *
 * All memory is handed back to the allocator of the device recorded in
 * job->device.  The list buffers are released first, the job itself last.
 */
void
vc4_job_free(struct vc4_vk_job *job)
{
        struct vc4_device *device = job->device;

        /* Backing stores of the five per-job lists, in release order. */
        void *list_storage[] = {
                job->bcl.base,
                job->shader_rec.base,
                job->uniforms.base,
                job->bo_handles.base,
                job->bo_pointers.base,
        };
        const unsigned list_count =
                sizeof(list_storage) / sizeof(list_storage[0]);

        for (unsigned i = 0; i < list_count; i++)
                vk_free(&device->vk.alloc, list_storage[i]);

        vk_free(&device->vk.alloc, job);
}

/**
 * Allocates a new job owned by the given device and sets its initial state.
 *
 * The job is obtained from the device allocator with vk_zalloc().  Its five
 * lists are initialised, the draw bounds start out inverted (min = ~0,
 * max = 0) and last_gem_handle_hindex starts at ~0.
 *
 * Returns the new job, or NULL (after reporting
 * VK_ERROR_OUT_OF_HOST_MEMORY through vk_error()) when allocation fails.
 */
struct vc4_vk_job *
vc4_job_create(struct vc4_device *device)
{
    struct vc4_vk_job *new_job;

    new_job = vk_zalloc(&device->vk.alloc, sizeof(*new_job), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
    if (new_job == NULL) {
        vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
        return NULL;
    }

    new_job->device = device;

    /* Binner list, shader records, uniforms, and the two BO tracking lists. */
    vc4_init_cl(new_job, &new_job->bcl);
    vc4_init_cl(new_job, &new_job->shader_rec);
    vc4_init_cl(new_job, &new_job->uniforms);
    vc4_init_cl(new_job, &new_job->bo_handles);
    vc4_init_cl(new_job, &new_job->bo_pointers);

    /* Inverted bounds: nothing has been drawn yet. */
    new_job->draw_max_x = 0;
    new_job->draw_max_y = 0;
    new_job->draw_min_x = ~0;
    new_job->draw_min_y = ~0;

    new_job->last_gem_handle_hindex = ~0;

    /* Perfmon hookup is not wired up here yet:
     *
     *     if (vc4->perfmon)
     *             job->perfmon = vc4->perfmon;
     */

    return new_job;
}

/**
 * Decodes a control list and prints it, packet by packet, to stderr.
 *
 * @param cl         Start of the control list bytes.
 * @param size       Number of bytes to decode, starting at cl.
 * @param is_render  Currently unused.
 *
 * Decoding stops at the end of the buffer, at the first packet the spec does
 * not know, or right after a HALT / STORE_MS_TILE_BUFFER_AND_EOF packet.  The
 * clif dumper created for the decode is destroyed on every one of those
 * paths.
 */
void
vc4_dump_cl(void *cl, uint32_t size, bool is_render)
{
        struct v3d_device_info devinfo = {
                /* While the driver supports V3D 2.1 and 2.6, we haven't split
                 * off a 2.6 XML yet (there are a couple of fields different
                 * in render target formatting)
                 */
                .ver = 21,
        };
        struct v3d_spec *spec = v3d_spec_load(&devinfo);

        struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true);

        /* offset counts every byte consumed; hw_offset skips GEM_HANDLES
         * packets (see the bottom of the loop).
         */
        uint32_t offset = 0, hw_offset = 0;
        uint8_t *p = cl;

        while (offset < size) {
                struct v3d_group *inst = v3d_spec_find_instruction(spec, p);
                uint8_t header = *p;
                uint32_t length;

                if (inst == NULL) {
                        fprintf(stderr, "0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n",
                                offset, hw_offset, header, header);
                        /* Cannot know the packet length, so stop decoding. */
                        goto out;
                }

                length = v3d_group_get_length(inst);

                fprintf(stderr, "0x%08x 0x%08x: 0x%02x %s\n",
                        offset, hw_offset, header, v3d_group_get_name(inst));

                v3d_print_group(clif, inst, offset, p);

                switch (header) {
                case VC4_PACKET_HALT:
                case VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF:
                        /* Terminating packets: nothing after them is dumped. */
                        goto out;
                default:
                        break;
                }

                offset += length;
                if (header != VC4_PACKET_GEM_HANDLES)
                        hw_offset += length;
                p += length;
        }

out:
        /* Single exit so the dumper is never leaked on an early stop. */
        clif_dump_destroy(clif);
}

static int vc4_wait_seqno_ioctl(int fd, uint64_t seqno, uint64_t timeout_ns)
{
        struct drm_vc4_wait_seqno wait = {
                .seqno = seqno,
                .timeout_ns = timeout_ns,
        };
        int ret = drmIoctl(fd, DRM_IOCTL_VC4_WAIT_SEQNO, &wait);
        if (ret == -1)
                return -errno;
        else
                return 0;

}

/**
 * Waits until the given seqno has finished, for at most timeout_ns.
 *
 * Returns true when the seqno is already covered by device->finished_seqno
 * or the wait succeeds (in which case finished_seqno is updated), and false
 * when the wait ends with -ETIME.  Any other wait error is printed to stderr
 * and the process aborts.
 *
 * With VC4_DEBUG_PERF set, a non-zero timeout and a non-NULL reason, a
 * zero-timeout probe is issued first and a message is printed if that probe
 * reports -ETIME.
 */
static
bool vc4_wait_seqno(struct vc4_device *device, uint64_t seqno, uint64_t timeout_ns,
               const char *reason)
{
        if (seqno <= device->finished_seqno)
                return true;

        if (unlikely(vc4_debug & VC4_DEBUG_PERF) && timeout_ns && reason &&
            vc4_wait_seqno_ioctl(device->fd, seqno, 0) == -ETIME) {
                fprintf(stderr, "Blocking on seqno %lld for %s\n",
                        (long long)seqno, reason);
        }

        const int err = vc4_wait_seqno_ioctl(device->fd, seqno, timeout_ns);

        if (err == 0) {
                device->finished_seqno = seqno;
                return true;
        }

        if (err != -ETIME) {
                fprintf(stderr, "wait failed: %d\n", err);
                abort();
        }

        return false;
}

/**
 * Fills in one general load/store surface slot of the submit structure.
 *
 * Does nothing when image is NULL.  Otherwise the slot receives the image's
 * BO handle index and memory offset.  For single-sampled images the bits
 * field is built from the buffer kind (ZS when is_depth, else color plus
 * the 565 / 8888 format) and the image tiling; an unrecognised tiling only
 * prints a message.  For multisampled images the slot is flagged as a
 * full-resolution read, which is asserted to never be a write.
 */
static void
vc4_submit_setup_rcl_surface(struct vc4_vk_job *job,
                             struct drm_vc4_submit_rcl_surface *submit_surf,
                             struct vc4_image *image,
                             bool is_depth, bool is_write)
{
        if (image == NULL)
                return;

        submit_surf->hindex = vc4_gem_hindex(job, &image->mem->bo);
        submit_surf->offset = image->mem_offset;

        /* Multisampled source: no load/store bits, just the full-res flag. */
        if (image->samples > 1) {
                assert(!is_write);
                submit_surf->flags |= VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES;
                return;
        }

        if (!is_depth) {
                submit_surf->bits = VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_COLOR,
                                                  VC4_LOADSTORE_TILE_BUFFER_BUFFER);
                if (vc4_rt_format_is_565(image->vk_format)) {
                        submit_surf->bits |= VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_BGR565,
                                                           VC4_LOADSTORE_TILE_BUFFER_FORMAT);
                } else {
                        submit_surf->bits |= VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_RGBA8888,
                                                           VC4_LOADSTORE_TILE_BUFFER_FORMAT);
                }
        } else {
                submit_surf->bits = VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS,
                                                  VC4_LOADSTORE_TILE_BUFFER_BUFFER);
        }

        if (image->tiling == VK_IMAGE_TILING_OPTIMAL) {
                submit_surf->bits |= VC4_SET_FIELD(VC4_TILING_FORMAT_T,
                                                   VC4_LOADSTORE_TILE_BUFFER_TILING);
        } else if (image->tiling == VK_IMAGE_TILING_LINEAR) {
                submit_surf->bits |= VC4_SET_FIELD(VC4_TILING_FORMAT_LINEAR,
                                                   VC4_LOADSTORE_TILE_BUFFER_TILING);
        } else {
                fprintf(stderr, "%s: unsupport tiling format\n", __func__);
        }

        /* Write accounting is not tracked here yet:
         *
         *     if (is_write)
         *             rsc->writes++;
         */
}

/**
 * Fills in the render-config (color write) surface slot of the submit
 * structure.
 *
 * Does nothing when image is NULL.  Otherwise the slot receives the image's
 * BO handle index and memory offset, and, for single-sampled images, render
 * config bits carrying the 565 / 8888 format and the memory (tiling) format.
 * An unrecognised tiling only prints a message.  Multisampled images leave
 * the bits field untouched.
 */
static void
vc4_submit_setup_rcl_render_config_surface(struct vc4_vk_job *job,
                                           struct drm_vc4_submit_rcl_surface *submit_surf,
                                           struct vc4_image *image)
{
        if (image == NULL)
                return;

        submit_surf->hindex = vc4_gem_hindex(job, &image->mem->bo);
        submit_surf->offset = image->mem_offset;

        if (image->samples > 1)
                return;

        if (vc4_rt_format_is_565(image->vk_format)) {
                submit_surf->bits = VC4_SET_FIELD(VC4_RENDER_CONFIG_FORMAT_BGR565,
                                                  VC4_RENDER_CONFIG_FORMAT);
        } else {
                submit_surf->bits = VC4_SET_FIELD(VC4_RENDER_CONFIG_FORMAT_RGBA8888,
                                                  VC4_RENDER_CONFIG_FORMAT);
        }

        if (image->tiling == VK_IMAGE_TILING_OPTIMAL) {
                submit_surf->bits |= VC4_SET_FIELD(VC4_TILING_FORMAT_T,
                                                   VC4_RENDER_CONFIG_MEMORY_FORMAT);
        } else if (image->tiling == VK_IMAGE_TILING_LINEAR) {
                submit_surf->bits |= VC4_SET_FIELD(VC4_TILING_FORMAT_LINEAR,
                                                   VC4_RENDER_CONFIG_MEMORY_FORMAT);
        } else {
                fprintf(stderr, "%s: unsupport tiling format\n", __func__);
        }

        /* Write accounting is not tracked here yet:  rsc->writes++; */
}

/**
 * Fills in a multisample write surface slot of the submit structure.
 *
 * When image is non-NULL the slot receives the image's BO handle index and
 * memory offset, and its bits field is cleared.  A NULL image leaves the
 * slot untouched.
 */
static void
vc4_submit_setup_rcl_msaa_surface(struct vc4_vk_job *job,
                                  struct drm_vc4_submit_rcl_surface *submit_surf,
                                  struct vc4_image *image)
{
        if (image != NULL) {
                submit_surf->hindex = vc4_gem_hindex(job, &image->mem->bo);
                submit_surf->offset = image->mem_offset;
                submit_surf->bits = 0;
                /* Write accounting is not tracked here yet:  rsc->writes++; */
        }
}

// #define SUBMIT_BO_DUMP_DEBUG

#ifdef SUBMIT_BO_DUMP_DEBUG

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* Prefix for dump file names; vc4_vk_build_submit_job() increments it after
 * each job it builds, so every job's files get a distinct number.
 */
static int dump_seqnum = 0;

/**
 * Writes size bytes from buf into a file named
 * "<dump_seqnum>_<name>_<index>_<type>" (relative path), creating or
 * truncating it.
 *
 * Debug-only helper: any failure to create or fully write the file prints a
 * message and aborts the process.
 */
static void dump_to_file(const char *name, uint32_t index, const char *type, const void *buf, uint32_t size)
{
        char fname[64];

        /* Bounded formatting: an over-long name is truncated, not overflowed. */
        snprintf(fname, sizeof(fname), "%04d_%s_%02u_%s",
                 dump_seqnum, name, (unsigned)index, type);

        /* O_CREAT requires an explicit mode argument. */
        int fd = open(fname, O_CREAT|O_TRUNC|O_RDWR, 0644);
        if (fd < 0) {
                printf("create file fail\n");
                abort();
        }

        /* write() may be interrupted or write less than requested, so keep
         * going until the whole buffer is on disk.
         */
        const uint8_t *cursor = buf;
        uint32_t remaining = size;
        while (remaining > 0) {
                ssize_t written = write(fd, cursor, remaining);

                if (written < 0 && errno == EINTR)
                        continue;
                if (written <= 0) {
                        printf("write file fail\n");
                        abort();
                }

                cursor += written;
                remaining -= (uint32_t)written;
        }

        close(fd);
}

/* Dumps the mapped contents of a BO, tagged with its handle index. */
static void bo_dump(struct vc4_bo *bo, uint32_t hindex)
{
        dump_to_file("VK", hindex, "bo", bo->map, bo->size);
}
#endif

/**
 * Fills in job->submit (the struct drm_vc4_submit_cl later passed to the
 * submit ioctl) from the state accumulated in the job.
 *
 * Returns early, leaving job->submit untouched, when job->needs_flush is not
 * set or when the draw bounds are empty.
 *
 * NOTE(review): on those early returns nothing marks the job as "not to be
 * submitted"; confirm that callers do not hand such a job to
 * vc4_vk_job_submit().
 *
 * When SUBMIT_BO_DUMP_DEBUG is defined, every referenced BO and the binner,
 * shader record and uniform lists are additionally written to dump files.
 */
void
vc4_vk_build_submit_job(struct vc4_vk_job *job)
{
        struct drm_vc4_submit_cl *submit = &job->submit;
        if (!job->needs_flush)
                return;

        /* The RCL setup would choke if the draw bounds cause no drawing, so
         * just drop the drawing if that's the case.
         */
        if (job->draw_max_x <= job->draw_min_x ||
            job->draw_max_y <= job->draw_min_y) {
                return;
        }

        /* Note: this dump happens before the INCREMENT_SEMAPHORE / FLUSH
         * packets below are appended, so they do not appear in it.
         */
        if (vc4_debug & VC4_DEBUG_CL) {
                fprintf(stderr, "BCL:\n");
                vc4_dump_cl(job->bcl.base, cl_offset(&job->bcl), false);
        }

        if (cl_offset(&job->bcl) > 0) {
                /* Increment the semaphore indicating that binning is done and
                 * unblocking the render thread.  Note that this doesn't act
                 * until the FLUSH completes.
                 */
                cl_ensure_space(&job->bcl, 8);
                cl_emit(&job->bcl, INCREMENT_SEMAPHORE, incr);
                /* The FLUSH caps all of our bin lists with a
                 * VC4_PACKET_RETURN.
                 */
                cl_emit(&job->bcl, FLUSH, flush);
        }

        /* Start with every surface slot's hindex at ~0 (presumably the
         * "no surface" value -- confirm against the kernel UAPI).  The setup
         * helpers below only overwrite it for images that are non-NULL.
         */
        submit->color_read.hindex = ~0;
        submit->zs_read.hindex = ~0;
        submit->color_write.hindex = ~0;
        submit->msaa_color_write.hindex = ~0;
        submit->zs_write.hindex = ~0;
        submit->msaa_zs_write.hindex = ~0;

        /* Reserve room for six more entries in each BO tracking list, one
         * per surface slot above (presumably consumed by vc4_gem_hindex()
         * in the setup helpers -- confirm).
         */
        cl_ensure_space(&job->bo_handles, 6 * sizeof(uint32_t));
        cl_ensure_space(&job->bo_pointers, 6 * sizeof(struct vc4_bo *));

        /* Color: the read surface is only set up when color was not
         * cleared; the write surfaces are always set up when color is
         * resolved.
         */
        if (job->resolve & PIPE_CLEAR_COLOR) {
                if (!(job->cleared & PIPE_CLEAR_COLOR)) {
                        vc4_submit_setup_rcl_surface(job, &submit->color_read,
                                                     job->color_read,
                                                     false, false);
                }
                vc4_submit_setup_rcl_render_config_surface(job,
                                                           &submit->color_write,
                                                           job->color_write);
                vc4_submit_setup_rcl_msaa_surface(job,
                                                  &submit->msaa_color_write,
                                                  job->msaa_color_write);
        }
        /* Depth/stencil: same pattern as color above. */
        if (job->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
                if (!(job->cleared & PIPE_CLEAR_DEPTHSTENCIL)) {
                        vc4_submit_setup_rcl_surface(job, &submit->zs_read,
                                                     job->zs_read, true, false);
                }
                vc4_submit_setup_rcl_surface(job, &submit->zs_write,
                                             job->zs_write, true, true);
                vc4_submit_setup_rcl_msaa_surface(job, &submit->msaa_zs_write,
                                                  job->msaa_zs_write);
        }

        if (job->msaa) {
                /* This bit controls how many pixels the general
                 * (i.e. subsampled) loads/stores are iterating over
                 * (multisample loads replicate out to the other samples).
                 */
                submit->color_write.bits |= VC4_RENDER_CONFIG_MS_MODE_4X;
                /* Controls whether color_write's
                 * VC4_PACKET_STORE_MS_TILE_BUFFER does 4x decimation
                 */
                submit->color_write.bits |= VC4_RENDER_CONFIG_DECIMATE_MODE_4X;
        }

        /* The handle list holds 32-bit handles, hence the byte offset / 4. */
        submit->bo_handles = (uintptr_t)job->bo_handles.base;
        submit->bo_handle_count = cl_offset(&job->bo_handles) / 4;
#ifdef SUBMIT_BO_DUMP_DEBUG
        printf("BO DUMP:\n");
        for (int k = 0; k < submit->bo_handle_count; k++) {
                uint32_t *hptr = (uint32_t *)job->bo_handles.base;
                uint32_t handle = hptr[k];

                /* bo_pointers is indexed in parallel with bo_handles; the
                 * check below aborts if the two lists disagree.
                 */
                struct vc4_bo **referenced_bos = job->bo_pointers.base;
                struct vc4_bo *bo = referenced_bos[k];
                if (bo->handle != handle) {
                        printf("-- handle mismatch !\n");
                        abort();
                }

                /* Map the BO only for the duration of the dump when it is
                 * not already mapped.
                 */
                if (!bo->map) {
                        vc4_shader_bo_map(job->device, bo);
                        bo_dump(bo, k);
                        vc4_bo_unmap(job->device, bo);
                } else {
                        bo_dump(bo, k);
                }
        }
#endif

        submit->bin_cl = (uintptr_t)job->bcl.base;
        submit->bin_cl_size = cl_offset(&job->bcl);
#ifdef SUBMIT_BO_DUMP_DEBUG
        printf("BIN DUMP:\n");
        dump_to_file("BIN", 0, "bcl", job->bcl.base, submit->bin_cl_size);
#endif

        submit->shader_rec = (uintptr_t)job->shader_rec.base;
        submit->shader_rec_size = cl_offset(&job->shader_rec);
        submit->shader_rec_count = job->shader_rec_count;
#ifdef SUBMIT_BO_DUMP_DEBUG
        printf("SHADER REC DUMP:\n");
        dump_to_file("SHA", 0, "rec", job->shader_rec.base, submit->shader_rec_size);
#endif

        submit->uniforms = (uintptr_t)job->uniforms.base;
        submit->uniforms_size = cl_offset(&job->uniforms);
#ifdef SUBMIT_BO_DUMP_DEBUG
        printf("UNIFORM DUMP:\n");
        dump_to_file("UNI", 0, "uniform", job->uniforms.base, submit->uniforms_size);
#endif

	if (job->perfmon)
		submit->perfmonid = job->perfmon->id;

        /* Convert the pixel draw bounds to tile coordinates.  The max
         * bounds have 1 subtracted before the division, i.e. they are
         * treated as exclusive pixel bounds and the resulting tile indices
         * are inclusive.
         */
        assert(job->draw_min_x != ~0 && job->draw_min_y != ~0);
        submit->min_x_tile = job->draw_min_x / job->tile_width;
        submit->min_y_tile = job->draw_min_y / job->tile_height;
        submit->max_x_tile = (job->draw_max_x - 1) / job->tile_width;
        submit->max_y_tile = (job->draw_max_y - 1) / job->tile_height;
        submit->width = job->draw_width;
        submit->height = job->draw_height;
        /* Clear values are only passed along when some buffer was cleared. */
        if (job->cleared) {
                submit->flags |= VC4_SUBMIT_CL_USE_CLEAR_COLOR;
                submit->clear_color[0] = job->clear_color[0];
                submit->clear_color[1] = job->clear_color[1];
                submit->clear_z = job->clear_depth;
                submit->clear_s = job->clear_stencil;
        }
        submit->flags |= job->flags;

#ifdef SUBMIT_BO_DUMP_DEBUG
        /* Next job's dump files get a new numeric prefix. */
        dump_seqnum++;
#endif
}

/**
 * Submits the job to the kernel.
 */
void
vc4_vk_job_submit(struct vc4_queue *vc4, struct vc4_vk_job *job)
{
        struct drm_vc4_submit_cl *submit = &job->submit;

        if (vc4->device->has_syncobj) {
                submit->out_sync = vc4->job_syncobj;

                if (vc4->sem && vc4->sem->fd >= 0) {
                        /* This replaces the fence in the syncobj. */
                        drmSyncobjImportSyncFile(vc4->device->fd, vc4->sem->sync,
                                                 vc4->sem->fd);
                        submit->in_sync = vc4->sem->sync;
                }
        }

        if (!(vc4_debug & VC4_DEBUG_NORAST)) {
                int ret;

                ret = drmIoctl(vc4->device->fd, DRM_IOCTL_VC4_SUBMIT_CL, submit);
                static bool warned = false;
                if (ret && !warned) {
                        fprintf(stderr, "Draw call returned %s.  "
                                        "Expect corruption.\n", strerror(errno));
                        warned = true;
                } else if (!ret) {
                        vc4->last_emit_seqno = submit->seqno;
                        if (job->perfmon)
                                job->perfmon->last_seqno = submit->seqno;
                }
        }

        if (vc4->last_emit_seqno - vc4->device->finished_seqno > 5) {
                if (!vc4_wait_seqno(vc4->device,
                                    vc4->last_emit_seqno - 5,
                                    PIPE_TIMEOUT_INFINITE,
                                    "job throttling")) {
                        fprintf(stderr, "Job throttling failed\n");
                }
        }

        if (vc4_debug & VC4_DEBUG_ALWAYS_SYNC) {
                if (!vc4_wait_seqno(vc4->device, vc4->last_emit_seqno,
                                    PIPE_TIMEOUT_INFINITE, "sync")) {
                        fprintf(stderr, "Wait failed.\n");
                        abort();
                }

#ifdef SUBMIT_BO_DUMP_DEBUG
                printf("FRAMEBUFFER DUMP:\n");
                if (job->color_write) {
                        if (!job->color_write->mem->bo.map) {
                                vc4_bo_map(vc4->device, &job->color_write->mem->bo);
                                bo_dump(&job->color_write->mem->bo, 98);
                                vc4_bo_unmap(vc4->device, &job->color_write->mem->bo);
                        } else {
                                bo_dump(&job->color_write->mem->bo, 98);
                        }
                }

                // if (job->zs_write) {
                //         if (!job->zs_write->mem->bo.map) {
                //                 vc4_bo_map(vc4->device, &job->zs_write->mem->bo);
                //                 bo_dump(&job->zs_write->mem->bo, 99);
                //                 vc4_bo_unmap(vc4->device, &job->zs_write->mem->bo);
                //         } else {
                //                 bo_dump(&job->zs_write->mem->bo, 99);
                //         }
                // }
#endif
        }
}
